Load library

rm(list = ls())
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readxl)
library(dplyr) 
library(ggplot2)
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(here)
## here() starts at /Users/sukem09
library(readxl)
library(dplyr)
library(magrittr)
## 
## Attaching package: 'magrittr'
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(mice)
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(quantmod)
## Loading required package: xts
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tsibble)
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:zoo':
## 
##     index
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(stats)
library(purrr)
library(rlang)
## 
## Attaching package: 'rlang'
## 
## The following object is masked from 'package:magrittr':
## 
##     set_names
## 
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
library(forecast)

a. Import dataset

tsdata <- read_excel("~/Desktop/case study/PredictorData2022.xlsx")
ts_quarter <- read_excel("~/Desktop/case study/PredictorData2022.xlsx", sheet = "Quarterly")
ts_year <- read_excel("~/Desktop/case study/PredictorData2022.xlsx", sheet = "Annual")


tsdata$yyyymm <- as.yearmon(as.character(tsdata$yyyymm), format="%Y%m")
ts_quarter$yyyyq <- as.yearqtr(as.character(ts_quarter$yyyyq), format="%Y%q")
ts_year$yyyy <- as.Date(as.character(ts_year$yyyy), format="%Y")

tsdata %<>% select(-CRSP_SPvw, -CRSP_SPvwx)
ts_quarter %<>% select(-CRSP_SPvw, -CRSP_SPvwx)
ts_year %<>% select(-CRSP_SPvw, -CRSP_SPvwx)
vars_monthly <- names(tsdata)
vars_quarterly <-names(ts_quarter) #Contains additional varibales: cay, ik, D3, E3
vars_annual <-names(ts_year) #Contains additional varibales: eqis, ik

setdiff(vars_quarterly,vars_monthly)
## [1] "yyyyq" "cay"   "ik"    "D3"    "E3"
setdiff(vars_annual,vars_monthly)
## [1] "yyyy" "cay"  "eqis" "ik"
# Rename the column
colname_mapping <- c(
"time" = "yyyymm",
  "time" = "yyyyq",
  "time" = "yyyy",
  "sp_index" = "Index",
  "dividend_12" = "D12",
  "earning_12" = "E12",
  "book_market" = "b/m",
  "treasury_bill" = "tbl",
  "cor_AAA" = "AAA",
  "cor_BAA" = "BAA",
  "lt_yeild" = "lty",
  "net_equity" = "ntis",
  "risk_free" = "Rfree",
  "inflation" = "infl",
  "lt_returnrate" = "ltr",
  "cbond_return" = "corpr",
  "stock_var" = "svar",
  "cs_premium" = "csp",
  "consum_weal_incomeratio" = "cay",
  "investment_capitalratio" = "ik",
  "dividend3year" = "D3",
  "earning3year" = "E3",
  "consum_wealth_monthly" = "caym",
  "equity_issuing" = "eqis"
)

#Rename columns 
filtered_dict_monthly <- colname_mapping[colname_mapping %in% colnames(tsdata)]
tsdata <- rename(tsdata, !!!filtered_dict_monthly)

filtered_dict_quarterly <- colname_mapping[colname_mapping %in% colnames(ts_quarter)]
ts_quarter <- rename(ts_quarter, !!!filtered_dict_quarterly)

filtered_dict_annual <- colname_mapping[colname_mapping %in% colnames(ts_year)]
ts_year <- rename(ts_year, !!!filtered_dict_annual)

Format dataframe

# Transform variables into numeric
for(i in names(tsdata)) {
  if(class(tsdata[[i]]) == "character") {
    tsdata[[i]] <- as.numeric(tsdata[[i]])
  }
}

for(i in names(ts_quarter)) {
  if(class(ts_quarter[[i]]) == "character") {
    ts_quarter[[i]] <- as.numeric(ts_quarter[[i]])
  }
}

for(i in names(ts_year)) {
  if(class(ts_year[[i]]) == "character") {
    ts_year[[i]] <- as.numeric(ts_year[[i]])
  }
}

b. Generate the excess returns series

calculate_returns <- function(input){
  input %<>% mutate(returns = as.vector(quantmod::Delt(sp_index))) 
  input %<>% mutate(excess_returns = returns - risk_free)
  return(input)
}
# Create subsets for different frequencies --------------------------------
tsdata %<>% calculate_returns()
ts_quarter %<>% calculate_returns()
ts_year %<>% calculate_returns()

Plot time series of all variables

sp_index_plot <- ggplot(tsdata, aes(x = time, y = sp_index)) +
  geom_line() +
  labs(title = "Time Series Plot of S&P 500 Index", x = "Time", y = "S&P 500 Index")
print(sp_index_plot)

vars_for_ts_plot <- setdiff(names(tsdata) , c("time", "sp_index_lag1m"))
vars_for_scatter_plot <-  setdiff(names(tsdata) , c("time", "sp_500", "lag_index", "returns", "excess_returns"))

plot_list_ts <- list()
plot_list_lagged <- list()
# Generate one scatter plot against excess return
for(i in 1:length(vars_for_ts_plot)) {
  title_text <- paste("(", letters[i], ") ", vars_for_ts_plot[i], sep = "")
  
  current_plot_ts <- ggplot(data=tsdata, aes(x=time, y=lag(!!sym(vars_for_ts_plot[i])))) +
    geom_line() + 
    ggtitle(title_text) +
    theme_bw()  
  
  plot_list_ts[[i]] <- current_plot_ts
  
}
# Create a grid layout of the plots
grid.arrange(grobs = plot_list_ts[1:6], ncol = 2)

grid.arrange(grobs = plot_list_ts[7:12], ncol = 2)

grid.arrange(grobs = plot_list_ts[13:18], ncol = 2)

vars_for_ts_plotq <- setdiff(names(ts_quarter) , c("time", "sp_index_lag1m"))
vars_for_scatter_plotq <-  setdiff(names(ts_quarter) , c("time", "sp_500", "lag_index", "returns", "excess_returns"))

plot_list_tsq <- list()
plot_list_laggedq <- list()
for(i in 1:length(vars_for_ts_plotq)) {
  title_text <- paste("(", letters[i], ") ", vars_for_ts_plotq[i], sep = "")
  
  current_plot_tsq <- ggplot(data=ts_quarter, aes(x=time, y=lag(!!sym(vars_for_ts_plotq[i])))) +
    geom_line() + 
    ggtitle(title_text) +
    theme_bw()  
  
  plot_list_ts[[i]] <- current_plot_tsq
  
}
# Create a grid layout of the plots
grid.arrange(grobs = plot_list_ts[1:6], ncol = 2)

grid.arrange(grobs = plot_list_ts[7:12], ncol = 2)

grid.arrange(grobs = plot_list_ts[13:18], ncol = 2)

grid.arrange(grobs = plot_list_ts[19:14], ncol = 2)

c. Structure of excess return series

Plot excess return series

excess_returns_plot <- ggplot(tsdata, aes(x = time, y = excess_returns)) +
  geom_line() +
  labs(title = "Time Series Plot of Monthly Excess Returns", x = "Time", y = "Excess Return")
print(excess_returns_plot)

excess_returns_plotq <- ggplot(ts_quarter, aes(x = time, y = excess_returns)) +
  geom_line() +
  labs(title = "Time Series Plot of Quaterly Excess Returns", x = "Time", y = "Excess Return")

print(excess_returns_plotq)

excess_returns_ploty <- ggplot(ts_year, aes(x = time, y = excess_returns)) +
  geom_line() +
  labs(title = "Time Series Plot of Yearly Excess Returns", x = "Time", y = "Excess Return")

print(excess_returns_ploty)

summary(tsdata$excess_returns)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -0.299723 -0.022091  0.003065  0.001912  0.028665  0.421222         1
summary(ts_quarter$excess_returns)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## -0.399606 -0.040886  0.011416  0.006462  0.058085  0.861607         1
summary(ts_year$excess_returns)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.48035 -0.08643  0.02497  0.02539  0.15412  0.46217        1

Plot acf and pacf

Monthly

excess_rt <- na.omit(tsdata$excess_returns)
acf_result <- Acf(excess_rt, lag.max = 60, main = "ACF of Excess Return")

pacf_result <- pacf(excess_rt,lag.max = 60, main = "PACF of Excess Return")

Look more likely AR(1) model

Quaterly

excess_rtq <- na.omit(ts_quarter$excess_returns)
acf_resultq <- Acf(excess_rtq, lag.max = 60, main = "ACF of Excess Return")

pacf_resultq <- pacf(excess_rtq,lag.max = 60, main = "PACF of Excess Return")

Look like AR (10)

Yearly

excess_rty <- na.omit(ts_year$excess_returns)
acf_resulty <- Acf(excess_rty, main = "ACF of Excess Return")

pacf_resulty <- pacf(excess_rty, main = "PACF of Excess Return")

Look like no correlation

d.e. Fit an AR models base on Information criteria and generate forecast

For monthly data

ar_modelling <- function(input) {
  data_cleaned <- input %>% filter(!is.na(excess_returns))
  
  p_values <- 1:2
  bic_values <- numeric(length(p_values))
  ar_models <- list()
  
  for (p in p_values) {
    AR_model <- arima(data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
    
    bic_values[p] <- AIC(AR_model)
    
    ar_models[[p]] <- AR_model
  }
  
  best_p <- p_values[which.min(bic_values)]
  best_AR <- ar_models[[best_p]]
  
  # Calculate fitted values
  fitted_values <- data_cleaned$excess_returns - best_AR$residuals
  
  plot_data <- data.frame(
    Date = as.Date(input$time[!is.na(input$excess_returns)]),
    ExcessReturns = data_cleaned$excess_returns,
    Fitted = fitted_values
  )
  
  MSFE <- sqrt(mean(best_AR$residuals^2))
  
  # Plot data
  plot <- ggplot(plot_data, aes(x = Date)) +
    geom_line(aes(y = ExcessReturns), color = "black") +
    geom_line(aes(y = Fitted), color = "red") +
    labs(title = paste("Time Series with AR(", best_p, ") Fitted Values")) +
    theme_minimal()
  
  result <- list(best_p = best_p, bic_values = bic_values, AR_model = best_AR, MSFE = MSFE, plot = plot)
  
  return(result)
}
result <- ar_modelling(input = tsdata)
print(paste("Best AR(p) order:", result$best_p))
## [1] "Best AR(p) order: 1"
summary(result$AR_model)
## 
## Call:
## arima(x = data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
## 
## Coefficients:
##          ar1  intercept
##       0.1145     0.0019
## s.e.  0.0233     0.0012
## 
## sigma^2 estimated as 0.002221:  log likelihood = 2982.43,  aic = -5958.87
## 
## Training set error measures:
##               ME RMSE MAE MPE MAPE
## Training set NaN  NaN NaN NaN  NaN
print(paste("AIC values for p =", 1:5, ":", result$bic_values))
## [1] "AIC values for p = 1 : -5958.86991226427"
## [2] "AIC values for p = 2 : -5958.38195971994"
## [3] "AIC values for p = 3 : -5958.86991226427"
## [4] "AIC values for p = 4 : -5958.38195971994"
## [5] "AIC values for p = 5 : -5958.86991226427"
print(paste("MSFE:", result$MSFE))
## [1] "MSFE: 0.0471252669186668"
print(result$plot)

ar_modelling <- function(input) {
  data_cleaned <- input %>% filter(!is.na(excess_returns))
  
  p_values <- 1:5
  bic_values <- numeric(length(p_values))
  ar_models <- list()
  
  for (p in p_values) {
    AR_model <- arima(data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
    
    bic_values[p] <- AIC(AR_model)
    
    ar_models[[p]] <- AR_model
  }
  
  best_p <- p_values[which.min(bic_values)]
  best_AR <- ar_models[[best_p]]
  
  # Calculate fitted values
  fitted_values <- data_cleaned$excess_returns - best_AR$residuals
  
  plot_data <- data.frame(
    Date = as.Date(input$time[!is.na(input$excess_returns)]),
    ExcessReturns = data_cleaned$excess_returns,
    Fitted = fitted_values
  )
  ar_model_summary <- coef(summary(best_AR))
  
  MSFE <- sqrt(mean(best_AR$residuals^2))
  
  # Plot data
  plot <- ggplot(plot_data, aes(x = Date)) +
    geom_line(aes(y = ExcessReturns), color = "black") +
    geom_line(aes(y = Fitted), color = "red") +
    labs(title = paste("Time Series with AR(", best_p, ") Fitted Values")) +
    theme_minimal()
  
  result <- list(best_p = best_p, bic_values = bic_values, AR_model = best_AR, AR_model_summary = ar_model_summary, MSFE = MSFE, plot = plot)
  
  return(result)
}
result <- ar_modelling(input = ts_quarter)
print(paste("Best AR(p) order:", result$best_p))
## [1] "Best AR(p) order: 4"
print(paste("BIC values for p =", 1:5, ":", result$bic_values))
## [1] "BIC values for p = 1 : -1115.64419869863"
## [2] "BIC values for p = 2 : -1113.64972456684"
## [3] "BIC values for p = 3 : -1128.31218224036"
## [4] "BIC values for p = 4 : -1136.67976421326"
## [5] "BIC values for p = 5 : -1134.73875396023"
summary(result$AR_model)
## 
## Call:
## arima(x = data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
## 
## Coefficients:
##          ar1     ar2     ar3      ar4  intercept
##       0.0011  0.0074  0.1617  -0.1302     0.0065
## s.e.  0.0402  0.0397  0.0397   0.0403     0.0040
## 
## sigma^2 estimated as 0.008822:  log likelihood = 574.34,  aic = -1136.68
## 
## Training set error measures:
##               ME RMSE MAE MPE MAPE
## Training set NaN  NaN NaN NaN  NaN
print(paste("MSFE:", result$MSFE))
## [1] "MSFE: 0.0939251511495606"
print(result$plot)

result <- ar_modelling(input = ts_year)
print(paste("Best AR(p) order:", result$best_p))
## [1] "Best AR(p) order: 2"
print(paste("BIC values for p =", 1:5, ":", result$bic_values))
## [1] "BIC values for p = 1 : -74.5693588331313"
## [2] "BIC values for p = 2 : -77.0878739547895"
## [3] "BIC values for p = 3 : -76.194208003193" 
## [4] "BIC values for p = 4 : -74.8766921143495"
## [5] "BIC values for p = 5 : -76.9190271692618"
summary(result$AR_model)
## 
## Call:
## arima(x = data_cleaned$excess_returns, order = c(p, 0, 0), method = "ML")
## 
## Coefficients:
##          ar1      ar2  intercept
##       0.0318  -0.1735     0.0256
## s.e.  0.0804   0.0810     0.0130
## 
## sigma^2 estimated as 0.03331:  log likelihood = 42.54,  aic = -77.09
## 
## Training set error measures:
##               ME RMSE MAE MPE MAPE
## Training set NaN  NaN NaN NaN  NaN
print(paste("MSFE:", result$MSFE))
## [1] "MSFE: 0.182520762925912"
print(result$plot)

f. Linear predictive model

regressors_all <- union(union(names(ts_year), names(tsdata)), names(ts_quarter)) %>% setdiff(c("risk_free", "sp_index", "returns", "excess_returns", "time"))
regressors_monthly <- names(tsdata) %>% setdiff(c( "risk_free","sp_index", "returns", "excess_returns", "time"))
regressors_quarterly <- names(ts_quarter) %>% setdiff(c( "risk_free","sp_index", "returns", "excess_returns","time"))
regressors_annual <- names(ts_year) %>% setdiff(c("risk_free","sp_index", "returns", "excess_returns", "time"))
lagged_regressors_all <- paste0("lag(", regressors_all, ")")
lagged_regressors_monthly <- paste0("lag(", regressors_monthly, ")")
lagged_regressors_quarterly <- paste0("lag(", regressors_quarterly, ")")
lagged_regressors_annual <- paste0("lag(", regressors_annual, ")")

MSFE_df <- data.frame(monthly=rep(NA,20),
                      quarterly=rep(NA,20),
                      annual=rep(NA,20),
                      row.names = c(regressors_all,"all predictors", "AR(p)"))

datasets <- list(tsdata, ts_quarter, ts_year)

Models monthly

# Create an empty data frame to store MSFE values
MSFE_df_monthly <- data.frame(Model = c(regressors_monthly, "all predictors", "AR(p)"), 
                              MSFE = rep(NA, length(regressors_monthly) + 2))

for (i in 1:length(MSFE_df_monthly$Model)) {
  if (MSFE_df_monthly$Model[i] == "AR(p)") {
    # Try to calculate MSFE for AR(p) model
    tryCatch({
      ar_result <- ar_modelling(input = tsdata)
      MSFE_df_monthly$MSFE[i] <- ar_result$MSFE_AIC
    }, error = function(e) {
      cat("Warning: AR(p) model did not converge or fit successfully.\n")
    })
  } else if (MSFE_df_monthly$Model[i] == "all predictors") {
    # LM for all predictors
    formula_all_predictors1 <- paste("excess_returns ~", paste(lagged_regressors_monthly, collapse = " + ")) %>% as.formula()
    model_all_predictors <- lm(formula_all_predictors1, data = tsdata)
    MSFE_df_monthly$MSFE[i] <- mean(model_all_predictors$residuals^2)^0.5
  } else {
    # LM for single predictor
    formula <- paste("excess_returns ~ ", MSFE_df_monthly$Model[i], sep = "") %>% as.formula()
    model <- lm(formula, data = tsdata)
    MSFE_df_monthly$MSFE[i] <- mean(model$residuals^2)^0.5
  }
}
## Warning: AR(p) model did not converge or fit successfully.
# Print or return the MSFE data frame
print(MSFE_df_monthly)
##             Model       MSFE
## 1     dividend_12 0.04740964
## 2      earning_12 0.04741400
## 3     book_market 0.05292828
## 4   treasury_bill 0.05292782
## 5         cor_AAA 0.05285853
## 6         cor_BAA 0.05285202
## 7        lt_yeild 0.05284913
## 8      net_equity 0.05400716
## 9       inflation 0.05201274
## 10  lt_returnrate 0.05385217
## 11   cbond_return 0.05298089
## 12      stock_var 0.04758575
## 13     cs_premium 0.04606330
## 14 all predictors 0.04470067
## 15          AR(p)         NA
# Create an empty data frame to store MSFE values
MSFE_df_quarterly <- data.frame(Model = c(regressors_quarterly, "all predictors", "AR(p)"), MSFE = rep(NA, length(regressors_quarterly) + 2))

for (i in 1:length(MSFE_df_quarterly$Model)) {
  if (MSFE_df_quarterly$Model[i] == "AR(p)") {
    # Try to calculate MSFE for AR(p) model
    tryCatch({
      ar_result <- ar_modelling(input = ts_quarter)
      MSFE_df_quarterly$MSFE[i] <- ar_result$MSFE_AIC
    }, error = function(e) {
      cat("Warning: AR(p) model did not converge or fit successfully.\n")
    })
  } else if (MSFE_df_quarterly$Model[i] == "all predictors") {
    # LM for all predictors
    formula_all_predictors <- paste("excess_returns ~", paste(lagged_regressors_quarterly, collapse = " + ")) %>% as.formula()
    model_all_predictors <- lm(formula_all_predictors, data = ts_quarter)
    MSFE_df_quarterly$MSFE[i] <- mean(model_all_predictors$residuals^2)^0.5
  } else {
    # LM for single predictor
    formula <- paste("excess_returns ~ ", MSFE_df_quarterly$Model[i], sep = "") %>% as.formula()
    model <- lm(formula, data = ts_quarter)
    MSFE_df_quarterly$MSFE[i] <- mean(model$residuals^2)^0.5
  }
}
## Warning: AR(p) model did not converge or fit successfully.
# Print or return the MSFE data frame
print(MSFE_df_quarterly)
##                      Model       MSFE
## 1              dividend_12 0.09595532
## 2               earning_12 0.09596016
## 3              book_market 0.10582089
## 4            treasury_bill 0.10619995
## 5                  cor_AAA 0.10601085
## 6                  cor_BAA 0.10572017
## 7                 lt_yeild 0.10609422
## 8  consum_weal_incomeratio 0.07902971
## 9               net_equity 0.10873910
## 10               inflation 0.10465406
## 11           lt_returnrate 0.10875313
## 12            cbond_return 0.10705539
## 13               stock_var 0.09654196
## 14              cs_premium 0.08306658
## 15 investment_capitalratio 0.07619183
## 16           dividend3year 0.07811759
## 17            earning3year 0.08351558
## 18          all predictors 0.05974119
## 19                   AR(p)         NA
MSFE_df_annual <- data.frame(Model = c(regressors_monthly, "all predictors", "AR(p)"), 
                             MSFE = rep(NA, length(regressors_monthly) + 2))


#annual data
for (i in 1:length(MSFE_df_annual$Model)) {
  if (MSFE_df_annual$Model[i] == "AR(p)") {
    # Try to calculate MSFE for AR(p) model
    tryCatch({
      ar_result <- ar_modelling(input = ts_year)
      MSFE_df_annual$MSFE[i] <- ar_result$MSFE_AIC
    }, error = function(e) {
      cat("Warning: AR(p) model did not converge or fit successfully.\n")
    })
  } else if (MSFE_df_annual$Model[i] == "all predictors") {
    # LM for all predictors
    formula_all_predictors <- paste("excess_returns ~", paste(lagged_regressors_annual, collapse = " + ")) %>% as.formula()
    model_all_predictors <- lm(formula_all_predictors, data = ts_year)
    MSFE_df_annual$MSFE[i] <- mean(model_all_predictors$residuals^2)^0.5
  } else {
    # LM for single predictor
    formula <- paste("excess_returns ~ ", MSFE_df_annual$Model[i], sep = "") %>% as.formula()
    model <- lm(formula, data = ts_year)
    MSFE_df_annual$MSFE[i] <- mean(model$residuals^2)^0.5
  }
}
## Warning: AR(p) model did not converge or fit successfully.
# Print or return the MSFE data frame
print(MSFE_df_annual)
##             Model      MSFE
## 1     dividend_12 0.1842201
## 2      earning_12 0.1833799
## 3     book_market 0.1812140
## 4   treasury_bill 0.1899635
## 5         cor_AAA 0.1882981
## 6         cor_BAA 0.1842969
## 7        lt_yeild 0.1892734
## 8      net_equity 0.1933174
## 9       inflation 0.1913295
## 10  lt_returnrate 0.1932781
## 11   cbond_return 0.1904286
## 12      stock_var 0.1805771
## 13     cs_premium 0.1727220
## 14 all predictors 0.1355022
## 15          AR(p)        NA
# MSFE_df_monthly
# MSFE_df_quarterly
# MSFE_df_annual
merged_12 <- merge(MSFE_df_monthly, MSFE_df_quarterly, by="row.names", all=TRUE)
rownames(merged_12) <- merged_12$Row.names
merged_12 %<>% select(-Row.names)
final_merged <- merge(merged_12, MSFE_df_annual, by="row.names", all=TRUE)
rownames(final_merged) <- final_merged$Row.names
final_merged %<>% select(-Row.names)
final_merged
##           Model.x     MSFE.x                 Model.y     MSFE.y          Model
## 1     dividend_12 0.04740964             dividend_12 0.09595532    dividend_12
## 10  lt_returnrate 0.05385217               inflation 0.10465406  lt_returnrate
## 11   cbond_return 0.05298089           lt_returnrate 0.10875313   cbond_return
## 12      stock_var 0.04758575            cbond_return 0.10705539      stock_var
## 13     cs_premium 0.04606330               stock_var 0.09654196     cs_premium
## 14 all predictors 0.04470067              cs_premium 0.08306658 all predictors
## 15          AR(p)         NA investment_capitalratio 0.07619183          AR(p)
## 16           <NA>         NA           dividend3year 0.07811759           <NA>
## 17           <NA>         NA            earning3year 0.08351558           <NA>
## 18           <NA>         NA          all predictors 0.05974119           <NA>
## 19           <NA>         NA                   AR(p)         NA           <NA>
## 2      earning_12 0.04741400              earning_12 0.09596016     earning_12
## 3     book_market 0.05292828             book_market 0.10582089    book_market
## 4   treasury_bill 0.05292782           treasury_bill 0.10619995  treasury_bill
## 5         cor_AAA 0.05285853                 cor_AAA 0.10601085        cor_AAA
## 6         cor_BAA 0.05285202                 cor_BAA 0.10572017        cor_BAA
## 7        lt_yeild 0.05284913                lt_yeild 0.10609422       lt_yeild
## 8      net_equity 0.05400716 consum_weal_incomeratio 0.07902971     net_equity
## 9       inflation 0.05201274              net_equity 0.10873910      inflation
##         MSFE
## 1  0.1842201
## 10 0.1932781
## 11 0.1904286
## 12 0.1805771
## 13 0.1727220
## 14 0.1355022
## 15        NA
## 16        NA
## 17        NA
## 18        NA
## 19        NA
## 2  0.1833799
## 3  0.1812140
## 4  0.1899635
## 5  0.1882981
## 6  0.1842969
## 7  0.1892734
## 8  0.1933174
## 9  0.1913295
# Minimum MSFE
final_merged %>% select(MSFE.x) %>% filter(MSFE.x == min(MSFE.x, na.rm = TRUE))
##        MSFE.x
## 14 0.04470067
final_merged %>% select(MSFE.y) %>% filter(MSFE.y == min(MSFE.y, na.rm = TRUE))
##        MSFE.y
## 18 0.05974119
final_merged %>% select(MSFE) %>% filter(MSFE == min(MSFE, na.rm = TRUE))
##         MSFE
## 14 0.1355022
# Maximum MSFE
final_merged %>% select(MSFE.x) %>% filter(MSFE.x == max(MSFE.x, na.rm = TRUE))
##       MSFE.x
## 8 0.05400716
final_merged %>% select(MSFE.y) %>% filter(MSFE.y == max(MSFE.y, na.rm = TRUE))
##       MSFE.y
## 11 0.1087531
final_merged %>% select(MSFE) %>% filter(MSFE == max(MSFE, na.rm = TRUE))
##        MSFE
## 8 0.1933174

h. select predictors

colMeans(is.na(tsdata))
##           time       sp_index    dividend_12     earning_12    book_market 
##   0.0000000000   0.0000000000   0.0000000000   0.0000000000   0.3300438596 
##  treasury_bill        cor_AAA        cor_BAA       lt_yeild     net_equity 
##   0.3223684211   0.3157894737   0.3157894737   0.3157894737   0.3678728070 
##      risk_free      inflation  lt_returnrate   cbond_return      stock_var 
##   0.0005482456   0.2768640351   0.3618421053   0.3618421053   0.0926535088 
##     cs_premium        returns excess_returns 
##   0.5679824561   0.0005482456   0.0005482456
colMeans(is.na(ts_quarter))
##                    time                sp_index             dividend_12 
##             0.000000000             0.000000000             0.000000000 
##              earning_12             book_market           treasury_bill 
##             0.000000000             0.328947368             0.322368421 
##                 cor_AAA                 cor_BAA                lt_yeild 
##             0.315789474             0.315789474             0.315789474 
## consum_weal_incomeratio              net_equity               risk_free 
##             0.532894737             0.366776316             0.001644737 
##               inflation           lt_returnrate            cbond_return 
##             0.277960526             0.361842105             0.361842105 
##               stock_var              cs_premium investment_capitalratio 
##             0.092105263             0.567434211             0.500000000 
##           dividend3year            earning3year                 returns 
##             0.769736842             0.421052632             0.001644737 
##          excess_returns 
##             0.001644737
colMeans(is.na(ts_year))
##                    time                sp_index             dividend_12 
##             0.000000000             0.000000000             0.000000000 
##              earning_12             book_market           treasury_bill 
##             0.000000000             0.328947368             0.322368421 
##                 cor_AAA                 cor_BAA                lt_yeild 
##             0.315789474             0.315789474             0.315789474 
## consum_weal_incomeratio              net_equity               risk_free 
##             0.486842105             0.361842105             0.000000000 
##               inflation          equity_issuing           lt_returnrate 
##             0.282894737             0.368421053             0.361842105 
##            cbond_return               stock_var              cs_premium 
##             0.361842105             0.092105263             0.565789474 
## investment_capitalratio                 returns          excess_returns 
##             0.500000000             0.006578947             0.006578947
formula_all_predictors_monthly <- paste("excess_returns ~", paste(lagged_regressors_monthly, collapse=" + ")) %>% as.formula()
full_model_monthly <- lm(formula_all_predictors_monthly, data=tsdata)  # Fit a model with all predictors 
step_model_monthly <- stats::step(full_model_monthly, direction="both", trace=0, k=2)  # Stepwise regression with BIC
summary(step_model_monthly)
## 
## Call:
## lm(formula = excess_returns ~ lag(dividend_12) + lag(book_market) + 
##     lag(cor_BAA) + lag(lt_yeild) + lag(net_equity) + lag(inflation) + 
##     lag(cbond_return) + lag(cs_premium), data = tsdata)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.245787 -0.025638  0.001655  0.028565  0.232978 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.0164892  0.0088653  -1.860 0.063267 .  
## lag(dividend_12)   0.0026003  0.0006924   3.756 0.000186 ***
## lag(book_market)   0.0473852  0.0131186   3.612 0.000323 ***
## lag(cor_BAA)      -0.8899889  0.2951611  -3.015 0.002651 ** 
## lag(lt_yeild)      0.8316425  0.3346388   2.485 0.013157 *  
## lag(net_equity)   -0.2693967  0.1167764  -2.307 0.021319 *  
## lag(inflation)    -1.3707488  0.3561847  -3.848 0.000129 ***
## lag(cbond_return)  0.2461855  0.0855536   2.878 0.004117 ** 
## lag(cs_premium)    3.8845272  1.0727422   3.621 0.000312 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04504 on 779 degrees of freedom
##   (1036 observations deleted due to missingness)
## Multiple R-squared:  0.06139,    Adjusted R-squared:  0.05175 
## F-statistic: 6.369 on 8 and 779 DF,  p-value: 5.003e-08
mean(step_model_monthly$residuals^2)^0.5
## [1] 0.04478429
subset_quarterly <- na.omit(ts_quarter)
formula_all_predictors_quarterly <- paste("excess_returns ~", paste(lagged_regressors_quarterly, collapse = " + ")) %>% as.formula()

# Fit a model with all predictors
full_model_quarterly <- lm(formula_all_predictors_quarterly, data = subset_quarterly)

# Stepwise regression
step_model_quarterly <- stats::step(full_model_quarterly, direction = "both", trace = 0, k = 2)

# Print summary of the stepwise model
summary(step_model_quarterly)
## 
## Call:
## lm(formula = excess_returns ~ lag(earning_12) + lag(cor_BAA) + 
##     lag(lt_yeild) + lag(consum_weal_incomeratio) + lag(cbond_return) + 
##     lag(stock_var) + lag(cs_premium) + lag(investment_capitalratio), 
##     data = subset_quarterly)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.17223 -0.04043  0.00524  0.03281  0.14851 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -0.540389   0.298095  -1.813 0.075867 .  
## lag(earning_12)              -0.007373   0.003677  -2.005 0.050386 .  
## lag(cor_BAA)                 -7.410718   3.444017  -2.152 0.036270 *  
## lag(lt_yeild)                 8.788780   3.547951   2.477 0.016669 *  
## lag(consum_weal_incomeratio)  3.039566   1.522820   1.996 0.051394 .  
## lag(cbond_return)             0.506399   0.321821   1.574 0.121901    
## lag(stock_var)                9.859165   2.678343   3.681 0.000569 ***
## lag(cs_premium)              33.920562  19.619596   1.729 0.089996 .  
## lag(investment_capitalratio) 18.247261   9.543994   1.912 0.061626 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06775 on 50 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3256, Adjusted R-squared:  0.2177 
## F-statistic: 3.017 on 8 and 50 DF,  p-value: 0.00764
# Calculate RMSE
MSFE_f <- sqrt(mean(step_model_quarterly$residuals^2))
MSFE_f
## [1] 0.06237151
subset_year <- na.omit(ts_year)
formula_all_predictors_annual <- paste("excess_returns ~", paste(lagged_regressors_annual, collapse = " + ")) %>% as.formula()

# Fit a model with all predictors
full_model_annual <- lm(formula_all_predictors_annual, data = subset_year)

# Stepwise regression
step_model_annual <- stats::step(full_model_annual, direction = "both", trace = 0, k = 2)

# Print summary of the stepwise model
summary(step_model_annual)
## 
## Call:
## lm(formula = excess_returns ~ lag(earning_12) + lag(book_market) + 
##     lag(inflation) + lag(equity_issuing) + lag(investment_capitalratio), 
##     data = subset_year)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.282627 -0.112904  0.007809  0.098624  0.313350 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                    0.359164   0.279793   1.284  0.20529   
## lag(earning_12)                0.005239   0.002589   2.023  0.04852 * 
## lag(book_market)               0.520536   0.161828   3.217  0.00230 **
## lag(inflation)                -2.314709   0.998531  -2.318  0.02466 * 
## lag(equity_issuing)           -0.899539   0.320321  -2.808  0.00713 **
## lag(investment_capitalratio) -11.664364   7.604887  -1.534  0.13151   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1501 on 49 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2956, Adjusted R-squared:  0.2237 
## F-statistic: 4.112 on 5 and 49 DF,  p-value: 0.003387
# Calculate RMSE
MSFE_f <- sqrt(mean(step_model_annual$residuals^2))
MSFE_f
## [1] 0.1416521

i. Forward stepwise model selection

data_monthly_cleaned <- na.omit(tsdata)
null_model <- lm(excess_returns ~ 1, data = data_monthly_cleaned)
current_formula <- "excess_returns ~ 1"
selected_predictors <- c()
min_AIC <- AIC(null_model)

# Repeat until no more predictors can be added
repeat {
  aics <- numeric(length(lagged_regressors_monthly))
  
  for(i in 1:length(lagged_regressors_monthly)) {
    # Skip if predictor already selected
    if (lagged_regressors_monthly[i] %in% selected_predictors) next
    
    new_formula_string <- paste(current_formula, "+", lagged_regressors_monthly[i])
    new_formula <- as.formula(new_formula_string)
    
    new_model <- lm(new_formula, data = data_monthly_cleaned)
    aics[i] <- AIC(new_model)
  }
  
  # Identify predictor with smallest AIC
  min_index <- which.min(aics)
  
  # If the new AIC is smaller, add predictor to model. Otherwise, stop.
  if (aics[min_index] < min_AIC) {
    selected_predictors <- c(selected_predictors, lagged_regressors_monthly[min_index])
    current_formula <- paste(current_formula, "+", lagged_regressors_monthly[min_index])
    min_AIC <- aics[min_index]
  } else {
    break
  }
}

# Print the final model formula
print(paste("Final model formula:", current_formula))
## [1] "Final model formula: excess_returns ~ 1 + lag(inflation) + lag(cs_premium) + lag(cbond_return) + lag(net_equity) + lag(stock_var) + lag(dividend_12) + lag(book_market) + lag(cor_BAA) + lag(lt_yeild) + lag(treasury_bill)"
data_monthly_cleaned <- na.omit(ts_quarter)
null_model <- lm(excess_returns ~ 1, data = data_monthly_cleaned)
current_formula <- "excess_returns ~ 1"
selected_predictors <- c()
min_AIC <- AIC(null_model)

# Repeat until no more predictors can be added
repeat {
  aics <- numeric(length(lagged_regressors_quarterly))
  
  for(i in 1:length(lagged_regressors_quarterly)) {
    # Skip if predictor already selected
    if (lagged_regressors_quarterly[i] %in% selected_predictors) next
    
    new_formula_string <- paste(current_formula, "+", lagged_regressors_quarterly[i])
    new_formula <- as.formula(new_formula_string)
    
    new_model <- lm(new_formula, data = data_monthly_cleaned)
    aics[i] <- AIC(new_model)
  }
  
  # Identify predictor with smallest AIC
  min_index <- which.min(aics)
  
  # If the new AIC is smaller, add predictor to model. Otherwise, stop.
  if (aics[min_index] < min_AIC) {
    selected_predictors <- c(selected_predictors, lagged_regressors_quarterly[min_index])
    current_formula <- paste(current_formula, "+", lagged_regressors_quarterly[min_index])
    min_AIC <- aics[min_index]
  } else {
    break
  }
}

# Print the final model formula
print(paste("Final model formula:", current_formula))
## [1] "Final model formula: excess_returns ~ 1"
data_monthly_cleaned <- na.omit(ts_year)
null_model <- lm(excess_returns ~ 1, data = data_monthly_cleaned)
current_formula <- "excess_returns ~ 1"
selected_predictors <- c()
min_AIC <- AIC(null_model)

# Repeat until no more predictors can be added
repeat {
  aics <- numeric(length(lagged_regressors_annual))
  
  for(i in 1:length(lagged_regressors_annual)) {
    # Skip if predictor already selected
    if (lagged_regressors_annual[i] %in% selected_predictors) next
    
    new_formula_string <- paste(current_formula, "+", lagged_regressors_annual[i])
    new_formula <- as.formula(new_formula_string)
    
    new_model <- lm(new_formula, data = data_monthly_cleaned)
    aics[i] <- AIC(new_model)
  }
  
  # Identify predictor with smallest AIC
  min_index <- which.min(aics)
  
  # If the new AIC is smaller, add predictor to model. Otherwise, stop.
  if (aics[min_index] < min_AIC) {
    selected_predictors <- c(selected_predictors, lagged_regressors_annual[min_index])
    current_formula <- paste(current_formula, "+", lagged_regressors_annual[min_index])
    min_AIC <- aics[min_index]
  } else {
    break
  }
}

# Print the final model formula
print(paste("Final model formula:", current_formula))
## [1] "Final model formula: excess_returns ~ 1 + lag(investment_capitalratio) + lag(equity_issuing) + lag(book_market) + lag(inflation) + lag(earning_12)"